<?php
// PHP Regex Spider v. 2.0
// Copyright 2004 Frank Mitchell. All rights reserved.
// http://thiefsystems.org/ccs/phpregexspider
include("utilities.php");

// ---------------------------------------------------------------------------
// Crawl configuration
// ---------------------------------------------------------------------------

// Starting URL. Every assignment overwrites the one before it, so only the
// last URL in this list is actually crawled (the others look like earlier
// crawl targets that were left in place).
$start = 'http://thiefsystems.org/ccs/';
$start = 'http://ptyagi13.googlepages.com/priyanka\'shomepage';
$start = 'http://itzmbmzcorner.blogspot.com';
$start = 'http://www.iit.edu';
$start = 'http://www.umich.edu';

// Not referenced anywhere else in this file.
// NOTE(review): presumably read by utilities.php - confirm before removing.
$myhost = "iit.edu";

// The crawl loop stops once its page counter reaches this value.
$no_of_links = 800;

// Patterns applied to every fetched page: the first capture group of $search
// is treated as the page title, the first capture group of $hsearch as a
// heading.
$search = '~<title>(.*?)<\/title>~';
$hsearch = '~<h1>(.*?)<\/h1>~';

// When false, the query string is stripped from every extracted link.
$follow_queries = false;

// When true, 'http://www.' is rewritten to 'http://' in extracted links.
$convert_www = false;

// When true, HTML entities in fetched pages are decoded before matching.
// Setting this to true may cause the spider to seg. fault when it
// encounters pages with malformed HTML code.
$convert_html = false;

// When true, absolute links whose host does not contain the starting host
// are discarded; when false they are kept but pushed deeper in the crawl.
$stay_in_domain = false;

// Depth increment given to a newly discovered link, and the depth from which
// pages are no longer meant to contribute further links.
$c_depth = 1;
$max_depth = 4;

// File extensions and URL schemes that are never followed.
$dont_follow = array(
    'jpg', 'gif', 'png', 'ico', 'zip', 'rar', 'tar', 'gz', 'c|', 'c',
    'pl', 'py', 'js', 'jar', 'reg', 'orig', 'exe', 'java', 'class', 'css',
    'xml', 'txt', 'dvi', 'ps', 'lot', 'doc', 'ppt', 'pdf', 'lit', 'mp3',
    'wav', 'ra', 'pm', 'mpg', 'mpeg', 'mso', 'psd', 'swf', 'img', 'vhdl',
    'dat', 'cpp', 'cls', 'tex', 'clq', 'mailto', 'javascript', 'news', 'feed',
    'file');

// Build information about the site we're going to search.
// The crawl loop compares link hosts against $b_host and builds absolute
// URLs from $b_scheme and $b_host, so a starting URL that parses but lacks
// either component is rejected here instead of leaving both undefined.
$url = parse_url($start);
if($url && isset($url['scheme'], $url['host']))
{
    $b_scheme = $url['scheme'];
    $b_host = $url['host'];
}
else
{
    echo("\nError!\n");
    echo('Description: Unable to parse starting URL. ');
    echo("Please enter a different URL to start from.\n");
    echo("Starting URL: " .$start. "\n\n");
    exit;
}

// Per-URL crawl bookkeeping, seeded with the starting URL.
//   $links  : URL => state (0 = queued; the crawl loop later writes 1 or -1)
//   $depths : URL => crawl depth (the starting URL is depth 0)
//   $tags   : URL => tag path accumulated so far
$links = array();
$links[$start] = 0;

$depths = array();
$depths[$start] = 0;

$tags = array();
$tags[$start] = "";

// Page titles keyed by URL; $headings is declared but not used in this file.
$titles = array();
$headings = array();

// Search results and their URLs, plus the number of pages processed so far.
$gold = array();
$gold_url = array();
$link_count = 0;

// Keep crawling until we run out of queued links or reach the page limit.
// $links maps every known URL to its state: 0 = queued, 1 = fetched,
// -1 = could not be retrieved. array_search() returns the first queued key,
// so pages are visited in the order in which they were discovered. The result
// is compared strictly against false so that a key PHP considers falsy does
// not end the crawl early.
while(($p_link = array_search(0, $links, true)) !== false)
{

    // Mark this link as having been seen.
    $links[$p_link] = 1;

    // Get the contents of the link we're currently looking at.
    // If we fail this, there's no point in going further.
    // Remove the @ symbol if you want to see all warnings for pages that
    // could not be retrieved.
    $contents = @file_get_contents($p_link);
    if($contents !== false)
    {

        // Convert any HTML characters we find, including quotes.
        if($convert_html)
        {
            $contents = html_entity_decode($contents, ENT_QUOTES);
        }

        // Build information about the link we're currently looking at.
        // Every component starts out empty so the link resolution further
        // down never reads an undefined variable.
        $p_scheme = '';
        $p_host = '';
        $p_path = '';
        $p_url = '';
        $p_parts = parse_url($p_link);
        if($p_parts)
        {
            $p_url = $p_link;
            if(isset($p_parts['scheme']))
            {
                $p_scheme = $p_parts['scheme'];
                $p_url = $p_scheme.'://';
            }
            if(isset($p_parts['host']))
            {
                $p_host = $p_parts['host'];
                $p_url .= $p_host;
            }
            if(isset($p_parts['path']))
            {
                $p_path = dirname($p_parts['path']);
                $p_url .= $p_path;

                // Remove leading and trailing slashes from our path.
                // (Writing '' into a string offset is an error in current
                // PHP, and {}-style offsets no longer parse.)
                $p_path = trim($p_path, '/');
            }

            // Add a trailing slash to our URL if one doesn't exist.
            if(substr($p_url, -1) != '/')
            {
                $p_url .= '/';
            }
        }
        else
        {
            $p_parts = array();
        }

        // Extract all the search matches from the current page.
        preg_match_all($search, $contents, $search_results);
        preg_match_all($hsearch, $contents, $hsearch_results);

        foreach($hsearch_results[1] as $hresult)
        {
            echo "Heading is ". $hresult . "\n";
        }

        // Tag the page once for its host/directory and once for each title.
        // This loop never appends to $gold_url, so the guard below only
        // suppresses a page if something else has filled that array.
        foreach($search_results[1] as $result)
        {
            if(!in_array($p_link, $gold_url))
            {
                // A URL such as http://host has no path component; pass
                // null in that case rather than reading a missing index.
                $url_host = getUrlHost(isset($p_parts['host']) ? $p_parts['host'] : null);
                $url_dir = getDirPath(isset($p_parts['path']) ? $p_parts['path'] : null);
                echo "PATH = " . $url_host.$url_dir . "\n";
                tagIt($p_link, $url_host.$url_dir);

                echo "TITLE = ". $result . "\n";
                $title_tag = getNiceName($result);
                if($title_tag != "")
                {
                    $tags[$p_link] = $tags[$p_link]."/". $title_tag;
                }
                $titles[$p_link] = $title_tag;
                tagIt($p_link, $tags[$p_link]);
            }
        }

        // Pages at or beyond the depth limit are tagged but contribute no
        // further links. (The index used to be the bare word p_link, so the
        // limit was never applied.)
        if($depths[$p_link] < $max_depth)
        {
            // Extract the links from the current page.
            preg_match_all('~href *= *(\'|")(.*?)\1~i', $contents, $link_results);

            // Loop through our extracted links and manipulate them.
            foreach($link_results[2] as $c_link)
            {
                // Decode the link in case it's been encoded, then trim any
                // whitespace that might be on it.
                $c_link = trim(urldecode($c_link));

                // Depth added to the current page's depth if this link gets
                // queued. Reset for every link so that the penalty given to
                // one external link does not leak onto the links after it.
                $c_depth = 1;

                // Build information about our extracted link.
                // If we can't parse the URL, don't continue.
                $c_scheme = '';
                $c_parts = parse_url($c_link);
                if($c_parts)
                {
                    if(isset($c_parts['scheme']))
                    {
                        $c_scheme = strtolower($c_parts['scheme']);
                    }

                    // Get the extension for this particular link.
                    $dot = strrchr($c_link, '.');
                    $c_ext = ($dot === false) ? '' : strtolower((string) substr($dot, 1));

                    // Skip links to files on our don't follow list.
                    if($c_ext != '' && in_array($c_ext, $dont_follow, true))
                    {
                        $c_link = '';
                    }

                    // Absolute link: decide whether it is external.
                    elseif($c_scheme != '')
                    {
                        if(isset($c_parts['host']) &&
                           strpos($c_parts['host'], $b_host) === false)
                        {
                            // External host: drop it, or keep it but place
                            // it close to the depth limit.
                            if($stay_in_domain === true)
                            {
                                $c_link = '';
                            }
                            else
                            {
                                $c_depth = $max_depth - 2;
                            }
                        }
                        elseif(in_array($c_scheme, $dont_follow, true))
                        {
                            $c_link = '';
                        }
                    }

                    // Only web URLs may reach file_get_contents(). Any other
                    // scheme is dropped so that a page cannot steer the
                    // spider into a local file or another stream wrapper.
                    if($c_link != '' && $c_scheme != '' &&
                       $c_scheme != 'http' && $c_scheme != 'https')
                    {
                        $c_link = '';
                    }

                    // Remove fragments from the end of a link.
                    if($c_link != '' && isset($c_parts['fragment']))
                    {
                        $c_link = str_replace('#'.$c_parts['fragment'], '', $c_link);
                    }

                    // Remove queries from the end of a link.
                    if(!$follow_queries && $c_link != '' && isset($c_parts['query']))
                    {
                        $c_link = str_replace('?'.$c_parts['query'], '', $c_link);
                    }
                }
                else
                {
                    // If we won't be able to follow it, mark it as bad.
                    $c_link = '';
                }

                // If our link's made it this far, it's good, so let's keep it.
                if($c_link != '')
                {

                    // Links that carry a scheme (http or https at this point)
                    // are already absolute; everything else is resolved
                    // against the page it was found on.
                    if($c_scheme == '')
                    {

                        // Case 1: The URL is of the form: /directory/file
                        // Resolved against the host of the current page,
                        // falling back to the starting site if that host is
                        // unknown.
                        if($c_link[0] == '/')
                        {
                            if($p_host != '')
                            {
                                $c_link = $p_scheme.'://'.$p_host.$c_link;
                            }
                            else
                            {
                                $c_link = $b_scheme.'://'.$b_host.$c_link;
                            }
                        }

                        // Case 2: The URL is of the form: ../directory/file
                        elseif(($up_levels = substr_count($c_link, '../')) > 0)
                        {
                            // Remove the relative bits from our link.
                            $c_link = str_replace('../', '', $c_link);

                            // Backtrack the required number of directories.
                            $path_array = explode('/', $p_path);
                            for($j = $up_levels; $j > 0; $j--)
                            {
                                array_pop($path_array);
                            }
                            $new_path = '';
                            foreach($path_array as $segment)
                            {
                                $new_path .= $segment.'/';
                            }
                            $new_path .= $c_link;

                            // Assemble the correct path for our link.
                            $c_link = $p_scheme.'://'.$p_host.'/'.$new_path;
                        }

                        // Case 3: The URL is of the form: ./directory/file
                        elseif(strpos($c_link, './') !== false)
                        {
                            $c_link = str_replace('./', '', $c_link);
                            $c_link = $p_url.$c_link;
                        }

                        // Case 4: The URL is of the form: file
                        else
                        {
                            $c_link = $p_url.$c_link;
                        }
                    }

                    // Remove any www. stuff from the start of our link.
                    if($convert_www)
                    {
                        $c_link = str_replace('http://www.', 'http://', $c_link);
                    }

                    // Strip trailing slashes and backslashes so that
                    // otherwise identical links are not queued twice.
                    $c_link = rtrim($c_link, "\/");

                    // Add our extracted link to our list of links to look at.
                    if(!array_key_exists($c_link, $links))
                    {
                        $links[$c_link] = 0;
                        $depths[$c_link] = $depths[$p_link] + $c_depth;
                        $tags[$c_link] = $tags[$p_link];
                    }
                }
            }
        }

        // Every page that was fetched counts towards the page limit, whether
        // or not its links were followed.
        $link_count = $link_count + 1;
    }
    else
    {
        // Mark this link as being unretrievable.
        $links[$p_link] = -1;
    }

    if($link_count === $no_of_links)
    {
        break;
    }
}


// How many links did we end up finding vs. searching?
// $links holds 1 for every page that was fetched, -1 for every page that
// could not be retrieved and 0 for every link that was discovered but never
// visited (the crawl can stop at the page limit with links still queued).
// "Found" therefore counts every known link, not just the visited ones, and
// the two visited counters default to 0 when no link ended up in that state.
$link_states = array_count_values($links);
$searched = isset($link_states[1]) ? $link_states[1] : 0;
$bad = isset($link_states[-1]) ? $link_states[-1] : 0;
$found = count($links);

echo("\nTotal number of links found was ".$found.".");
echo("\nTotal number of links searched was ".$searched.".");
echo("\nTotal number of bad links was ".$bad.".\n\n");

// What kind of search results did we get?
$count = count($gold);

echo("\nSearch results: \n\n");
for($i = 0; $i < $count; $i++)
{
    // One result per line: crawl depth, a '#', then the URL.
    echo("<br/>\n");
    echo($depths[$gold_url[$i]]. "# ");
    echo($gold_url[$i]. " ");
}


echo("\nTotal number of search results found was ".$count.".\n\n");

/**
 * Normalises a tag path and attaches it to a URL by running the external
 * `tag add <url> <tag>` command, then echoes the command that was run.
 *
 * The tag text is HTML-escaped, its separators (|, -, /, ::, :) are turned
 * into '/', a list of boilerplate phrases is removed or abbreviated, repeated
 * path segments are dropped and doubled slashes are collapsed.
 *
 * Both values come from crawled pages, so they are passed to the shell
 * through escapeshellarg() rather than being pasted between double quotes,
 * where $(...), backticks and $VAR would still be interpreted.
 *
 * @param string $gold_url URL the tag is attached to.
 * @param string $gold     Raw tag text, e.g. a page title or a host/path string.
 * @return void
 */
function tagIt($gold_url, $gold)
{
    $gold = trim($gold);
    $gold = htmlspecialchars($gold);
    $gold_url = htmlspecialchars($gold_url);

    // Turn every separator, together with the whitespace around it, into '/'.
    $gold = preg_replace(
        '/(\s*\|\s*)|(\s*-\s*)|(\s*\/\s*)|(\s*::\s*)|(\s*:\s*)/',
        '/',
        $gold
    );

    // Phrase clean-up. The patterns are applied one after another in the
    // order listed, each one working on the result of the previous one.
    $pats = array(
        "/Illinois\sInstitute\sof\sTechnology/" => "IIT",
        "/IIT\sChicago/"                        => "IIT",
        "/\sDepartment\//"                      => "/",
        "/CS\sDept/"                            => "",
        "/http/"                                => "",
        "/Untitled/"                            => "",
        "/Document/"                            => "",
        "/Computer Science\s/"                  => "CS ",
        "/,\s*and/"                             => "and",
        "/\/\s*and/"                            => "",
        "/\/\s*And/"                            => "",
        "/Forwarded Link/"                      => "",
        "/Redirecting\sto/"                     => "",
        "/Welcome\sto\s/"                       => "",
        "/Welcome\s/"                           => "",
        "/\/Welcome/"                           => "",
        "/\/\s*http/"                           => "",
        "/Today\s/"                             => "",
        "/Chicago\s/"                           => "",
        "/\sof\sIIT/"                           => "",
        "/\sat\sIIT/"                           => "",
        "/\/Home$/"                             => "",
        "/\/Home\//"                            => "",
        "/\/Home\sPage/"                        => "/",
        "/\sMain\sPage/"                        => "",
        "/the\s/"                               => "",
        "/\/\s\//"                              => "/",
        "/The\s/"                               => "",
    );
    $gold = preg_replace(array_keys($pats), array_values($pats), $gold);

    // IIT-specific fix-ups.
    $patterns = array(
        '/\/IIT\/IIT$/',  // a trailing /IIT/IIT becomes /IIT
        '/\S\/IIT$/',     // replaced by itself (\0), so this one changes nothing
        '/IIT\s/',        // "IIT " becomes "IIT/"
    );
    $replace = array('/IIT', '\\0', 'IIT/');
    $gold = preg_replace($patterns, $replace, $gold);

    // Keep only the first occurrence of every path segment.
    $gold = implode('/', array_unique(explode('/', $gold)));

    // Collapse any run of slashes into a single one.
    $gold = preg_replace('~/{2,}~', '/', $gold);

    $gold = trim($gold);

    // escapeshellarg() rejects NUL bytes, so drop them first.
    $gold_url = str_replace("\0", '', $gold_url);
    $gold = str_replace("\0", '', $gold);

    $command = 'tag add ' . escapeshellarg($gold_url) . ' ' . escapeshellarg($gold);
    exec($command);
    echo($command);
    echo "\n";
}

?>
